# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
from east_money.items import EastMoneyItem
import requests
import json
import time

class EastSpider(scrapy.Spider):
    """Crawl East Money news: fetch the article list from the column API,
    then scrape title, summary and body text from each article page."""

    name = "east"
    allowed_domains = ["stock.eastmoney.com"]
    start_urls = []
    # Kept for backward compatibility with any external code referencing
    # ``spider.item``; ``parse`` now builds a fresh item per response, because
    # a single shared item gets clobbered by concurrently processed responses.
    item = EastMoneyItem()
    # Translation table mapping filesystem-sensitive characters in titles to
    # spaces, so the title can be used safely (e.g. as a filename).
    intab = "?*/\|.:><"
    outtab = "         "
    trantab = str.maketrans(intab, outtab)

    def start_requests(self):
        """Fetch the news list from the column API and yield one Request per article.

        NOTE(review): this bootstrap call uses blocking ``requests`` inside
        Scrapy; tolerable for a single startup call, but the article pages
        themselves go through Scrapy's async scheduler.
        """
        base_url = 'https://np-listapi.eastmoney.com/comm/web/getNewsByColumns?client=web&biz=web_news_col&column=611&order=1&needInteractData=0&page_index=1&page_size=50&req_trace={}'
        # Millisecond timestamp serves as a request trace id / cache-buster.
        start_url = base_url.format(int(time.time() * 1000))
        # Timeout keeps a hung API endpoint from stalling the whole spider.
        parsed_data = requests.get(url=start_url, timeout=10).json()
        if parsed_data is None:
            self.logger.warning('news-list API returned no data')
        else:
            for ele in parsed_data["data"]["list"]:
                yield scrapy.Request(url=ele["url"])

    def parse(self, response):
        """Parse one article page into an EastMoneyItem.

        Fills: href (page URL), title (sanitized via ``trantab``), summary
        (or a placeholder when absent), content (one paragraph per line).
        """
        # Fresh item per response — see note on the class attribute above.
        item = EastMoneyItem()
        item['href'] = response.url
        xpath_tree = etree.HTML(response.text)
        # Prefer the <title> tag; fall back to the in-page <h1> used by the
        # newer page layout.
        title = xpath_tree.xpath('//title')
        if not title:
            title = xpath_tree.xpath('/html/body/div[1]/div[3]/div/div[1]/div[1]/div/div[2]/h1')
        # ``.text`` is None when the node has no direct text; avoid crashing.
        item['title'] = (title[0].text or '').translate(self.trantab)
        # Article body: id-based lookup first, positional fallback second.
        text = xpath_tree.xpath('//*[@id="ContentBody"]')
        if not text:
            text = xpath_tree.xpath('/html/body/div[1]/div[3]/div/div[1]/div[1]/div/div[3]')
        text_content = text[0]
        summary = text_content.xpath('./div[@class="b-review"]')
        if summary:
            item['summary'] = summary[0].text.strip()
        else:
            item['summary'] = '该新闻没有摘要'
        # Join paragraph texts, one paragraph per line (trailing newline kept,
        # matching the original concatenation loop).
        item['content'] = ''.join(
            p.xpath('string(.)').strip() + '\n'
            for p in text_content.xpath('./p')
        )
        yield item
